{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Using TensorFlow backend.\n"
     ]
    }
   ],
   "source": [
    "import re\n",
    "import html\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from bs4 import BeautifulSoup\n",
    "from nltk.corpus import stopwords\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from conf import config"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>label</th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>ham</td>\n",
       "      <td>Ü collecting ur laptop then going to configure...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>ham</td>\n",
       "      <td>Sorry, I can't text &amp;amp; drive coherently, se...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>spam</td>\n",
       "      <td>PRIVATE! Your 2003 Account Statement for shows...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>ham</td>\n",
       "      <td>What's up. Do you want me to come online?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>ham</td>\n",
       "      <td>The guy did some bitching but I acted like i'd...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "  label                                               text\n",
       "0   ham  Ü collecting ur laptop then going to configure...\n",
       "1   ham  Sorry, I can't text &amp; drive coherently, se...\n",
       "2  spam  PRIVATE! Your 2003 Account Statement for shows...\n",
       "3   ham          What's up. Do you want me to come online?\n",
       "4   ham  The guy did some bitching but I acted like i'd..."
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train = pd.read_csv('../data/origin_data/sms_train.txt', sep='\\t', header=None, names=['label', 'text'])\n",
    "train.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>Aiyo a bit pai seh ü noe... Scared he dun rem ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Are you driving or training?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>It means u could not keep ur words.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>Babe, I'm back ... Come back to me ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>I don't think I can get away for a trek that l...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                                                text\n",
       "0  Aiyo a bit pai seh ü noe... Scared he dun rem ...\n",
       "1                       Are you driving or training?\n",
       "2                It means u could not keep ur words.\n",
       "3             Babe, I'm back ... Come back to me ...\n",
       "4  I don't think I can get away for a trek that l..."
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "test = pd.read_csv('../data/origin_data/sms_test.txt', sep='\\t', header=None, names=['text'])\n",
    "test.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clean_text(origin_text):\n",
    "    # 去掉html标签\n",
    "    text = BeautifulSoup(origin_text).get_text()\n",
    "    # 去掉标点符号和非法字符\n",
    "    text = re.sub(\"[^a-zA-Z]\",\" \",text)\n",
    "    # 将字符全部转化为小写，并通过空格符进行分词处理\n",
    "    words = text.lower().split()\n",
    "    # 去停用词\n",
    "    stop_words = set(stopwords.words(\"english\"))\n",
    "    meaningful_words = [w for w in words if w not in stop_words]\n",
    "    # 将剩下的词还原成str类型\n",
    "    cleaned_text = \" \".join(meaningful_words)\n",
    "    return cleaned_text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "train['text'] = train['text'].apply(lambda x: clean_text(x))\n",
    "test['text'] = test['text'].apply(lambda x: clean_text(x))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "73\n"
     ]
    }
   ],
   "source": [
    "texts = train.text.tolist()\n",
    "max_len = 0\n",
    "df_len_list = []\n",
    "for i in range(len(texts)):\n",
    "    text = []\n",
    "    text = texts[i].split(' ')\n",
    "#     if max_len < len(text):\n",
    "#         print(text)\n",
    "    df_len_list.append(len(text))\n",
    "    max_len = max(max_len, len(text))\n",
    "print(max_len)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "51"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "sorted_len = sorted(df_len_list)\n",
    "sorted_len[int(len(sorted_len)*0.999)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ham     3041\n",
       "spam     407\n",
       "Name: label, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train.label.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "train.to_csv(config.train_path, index=False)\n",
    "test.to_csv(config.test_path, index=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# all_df = pd.read_csv('../data/origin_data/SMSSpamCollection.txt', sep='\\t', header=None, names=['label', 'text'])\n",
    "# test = pd.read_csv('../data/origin_data/sms_test.txt', sep='\\t', header=None, names=['text'])\n",
    "# all_df.drop_duplicates('text', keep='first', inplace=True)\n",
    "# all_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# answer = pd.merge(test, all_df, how='left', on='text')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# answer.to_csv('../data/origin_data/answer.csv', index=False)\n",
    "# answer.head()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
